In [1]:
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR
cd "E:\research\multisequenceanddcafrustratometry"


E:\research\multisequenceanddcafrustratometry

Get list of all PFAM families


In [17]:
# Fetch the list of all PFAM families (helper defined elsewhere in the project).
families = pfam_get_list_of_families()



In [39]:
# Download the per-family XML metadata files for every PFAM family.
pfam_download_family_xml_files(families)

In [102]:
# Extract the accession identifiers (e.g. "PF00005") from the family list.
accessions = pfam_get_accessions(families)

Find list of PFAM "Domains" that match our criteria


In [127]:
# Keep only those accessions that satisfy the "Domain" selection criteria
# (criteria implemented in the helper, defined elsewhere).
domain_accessions = pfam_find_domains_matching_criteria(accessions)



In [128]:
# Number of PFAM domains matching the selection criteria.
len(domain_accessions)


Out[128]:
1845

Download seed and full alignments corresponding to the selected domains


In [191]:
# Download the seed alignments for the selected domains.
pfam_download_alignments(domain_accessions, "seed")

In [ ]:
# Download the full alignments for the selected domains.
pfam_download_alignments(domain_accessions, "full")

In [200]:
# Decompress each downloaded full-alignment archive in place.
for domain_accession in domain_accessions:
    gzip_decompress("%s_full.aln.gz" % domain_accession)

Find the longest representative structure in each PFAM family


In [177]:
# Parse the PFAM-to-PDB mapping file into a per-accession structure table.
structures = pfam_parse_pdb_file('pdb_pfamA_reg.txt')



In [222]:
# Restrict the structure mapping to the selected domain accessions.
domain_structures = {
    accession: structures[accession] for accession in domain_accessions
}

In [ ]:
# pfam_find_longest_structure returns a (longest_domains, no_pdb_families)
# pair — see the identical call later in this notebook (In [ ] before In [34]).
# The original bound the whole tuple to a single unused name
# `longest_structures`, so the next cell's use of `longest_domains` failed on
# a fresh kernel; unpack both results here.
longest_domains, no_pdb_families = pfam_find_longest_structure(domain_structures, download_pdbs=True)

In [235]:
# Count the distinct PDB IDs among the selected longest domains.
len({domain[0] for domain in longest_domains.values()})


Out[235]:
1728

Create truncated versions of the PDB files corresponding to the longest domains


In [2]:
# Execution counts reset here — the kernel was restarted and the pipeline
# re-run from the top: fetch the PFAM family list again.
families = pfam_get_list_of_families()

In [3]:
# Re-extract accession identifiers after the restart.
accessions = pfam_get_accessions(families)

In [4]:
# Re-apply the "Domain" selection criteria after the restart.
domain_accessions = pfam_find_domains_matching_criteria(accessions)

In [5]:
# Re-parse the PFAM-to-PDB mapping file after the restart.
structures = pfam_parse_pdb_file('pdb_pfamA_reg.txt')

In [6]:
# Rebuild the accession -> structures mapping for the selected domains.
domain_structures = {
    accession: structures[accession] for accession in domain_accessions
}

In [ ]:
# Find the longest representative structure per family; also returns the
# families for which no PDB-format structure could be downloaded.
longest_domains, no_pdb_families = pfam_find_longest_structure(domain_structures, download_pdbs=True)

In [34]:
# Families with no PDB-format structure available.
no_pdb_families


Out[34]:
['PF08079', 'PF14204']

In [35]:
# Drop families with no PDB-format structures from both the accession list
# and the structure mapping.
for accession_without_pdb in no_pdb_families:
    domain_accessions.remove(accession_without_pdb)
    del domain_structures[accession_without_pdb]

In [3]:
# Save dictionaries and lists to files
# pickle.dump(longest_domains, open("longest_domains.pkl", "wb"))
# NOTE(review): pickle.load executes arbitrary code embedded in the file —
# only load pickles produced by this notebook.
longest_domains = pickle.load(open("longest_domains.pkl", "rb"))

In [ ]:
# Write a truncated PDB file for each selected longest domain, keeping only
# the residues accepted by the chain/range selection object.
io = PDBIO()
for pdb_id, chain_id, start_residue, end_residue in longest_domains.values():
    start_residue = int(start_residue)
    end_residue = int(end_residue)
    structure = parse_pdb("%s" % pdb_id)
    selector = pfam_is_in_structure_selection(chain_id, start_residue, end_residue)
    io.set_structure(structure)
    io.save("%s%s_%d-%d.pdb" % (pdb_id, chain_id, start_residue, end_residue), select=selector)

In [18]:
# Identify domains spanning fewer than 50 residues, then drop them from the
# selection (collect first so the dict is not mutated while iterating).
too_short_structures = [
    accession
    for accession, domain in longest_domains.items()
    if int(domain[3]) - int(domain[2]) + 1 < 50
]
for accession in too_short_structures:
    del longest_domains[accession]

In [19]:
# Save dictionaries and lists to files
# Persist the filtered selection, then reload it to confirm the round-trip.
# Use context managers so the file handles are closed deterministically
# (the original passed bare open(...) calls to pickle, leaking the handles).
with open("longest_domains.pkl", "wb") as pkl_out:
    pickle.dump(longest_domains, pkl_out)
with open("longest_domains.pkl", "rb") as pkl_in:
    longest_domains = pickle.load(pkl_in)

Extract sequences from trimmed pdb structures and write to fasta formatted files


In [24]:
# Extract the residue sequence from each trimmed structure and write it to a
# single-record FASTA file named after the domain.
for domain in longest_domains.values():
    pdb_id, selected_chain, selected_start, selected_end = domain
    # Hoist the int conversions — the original repeated int() four times per value.
    selected_start = int(selected_start)
    selected_end = int(selected_end)
    structure = parse_pdb("%s%s_%d-%d" % (pdb_id, selected_chain, selected_start, selected_end))
    sequence = get_sequence_from_structure(structure)
    # Context manager guarantees the file is closed even if a write fails
    # (the original only closed on the happy path).
    with open("%s%s_%d-%d.fasta" % (pdb_id, selected_chain, selected_start, selected_end), "w") as output_file:
        output_file.write(">%s%s_%d-%d\n" % (pdb_id, selected_chain, selected_start, selected_end))
        output_file.write(sequence)

Download HMMs


In [26]:
# Download the HMM for every selected family (the domain tuple is unused,
# so iterate over the keys directly).
for accession in longest_domains:
    pfam_download_hmm(accession)

Align sequences extracted from structures to family using downloaded HMMs


In [89]:
# Align each structure-derived sequence to its family HMM; record families
# whose alignment fails so they can be removed below.
invalid_accessions = []
for accession, domain in longest_domains.items():
    pdb_id, selected_chain, selected_start, selected_end = domain
    try:
        pfam_align_sequence_to_family("%s%s_%s-%s.fasta" % (pdb_id, selected_chain, selected_start, selected_end), accession)
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
        # catch Exception so the run can still be interrupted cleanly.
        invalid_accessions.append(accession)

In [92]:
# remove CA-only and EM structures (the families whose alignment failed above)
for failed_accession in invalid_accessions:
    del longest_domains[failed_accession]

In [13]:
# Save dictionaries and lists to files
#pickle.dump(longest_domains, open("longest_domains.pkl", "wb"))
# NOTE(review): only load pickles produced by this notebook — pickle.load can
# execute arbitrary code from an untrusted file.
longest_domains = pickle.load(open("longest_domains.pkl", "rb"))



In [94]:
# Number of domains remaining after the removals above.
len(longest_domains)


Out[94]:
1815

Build multiple sequence files for the PDB structures and exclude highly gapped sequences


In [40]:
# Build the per-structure multiple sequence file by filtering each family's
# full alignment against the structure's alignment.
# NOTE(review): filter_percentage=0.05 presumably sets the gap-fraction
# threshold used to exclude highly gapped sequences — confirm against the
# helper's definition.
for accession, domain in longest_domains.items():
    pdb_id, selected_chain, selected_start, selected_end = domain
    pfam_filter_full_alignment_by_pdb_alignment(accession, "%s%s_%s-%s.aln" % (pdb_id, selected_chain, selected_start, selected_end), filter_percentage=0.05)


Collect information on each domain and the corresponding set of sequences


In [51]:
# number of domains
# lengths of domains
# number of sequences
# mean sequence entropies

In [42]:
# Number of domains in the final selection.
len(longest_domains)


Out[42]:
1815

In [44]:
# Measure the extracted sequence length of each trimmed domain structure.
lengths_of_domains = []
for pdb_id, chain_id, start_residue, end_residue in longest_domains.values():
    trimmed = parse_pdb("%s%s_%d-%d" % (pdb_id, chain_id, int(start_residue), int(end_residue)))
    lengths_of_domains.append(len(get_sequence_from_structure(trimmed)))

In [50]:
# For each domain, compute statistics over its filtered sequence set:
# the number of sequences and the mean per-position sequence entropy.
num_sequences = []
mean_sequence_entropies = []
for domain in list(longest_domains.values()):
    pdb_id, selected_chain, selected_start, selected_end = domain
    # num_lines = count_lines_in_file("%s%s_%d-%d_filtered_0.05.seqs" % (pdb_id, selected_chain, int(selected_start), int(selected_end)))
    frequency_matrix, sequence_entropy, number_of_sequences, sequence_length, mean_sequence_entropy = compute_average_sequence_and_sequence_entropy("%s%s_%d-%d_filtered_0.05.seqs" % (pdb_id, selected_chain, int(selected_start), int(selected_end)))
    num_sequences.append(number_of_sequences)
    mean_sequence_entropies.append(mean_sequence_entropy)

In [55]:
# Distribution of domain lengths.
fig = plotly_histogram(lengths_of_domains)
py.iplot(fig)


Out[55]:

In [56]:
# Distribution of the number of sequences per domain.
fig = plotly_histogram(num_sequences)
py.iplot(fig)


Out[56]:

In [57]:
# Distribution of mean sequence entropies.
fig = plotly_histogram(mean_sequence_entropies)
py.iplot(fig)


Out[57]:

In [81]:
# Mean sequence entropy versus number of sequences per domain.
fig = plotly_scatter(num_sequences, mean_sequence_entropies)
py.iplot(fig)


Out[81]:

In [82]:
# Domain length versus number of sequences, colored by mean sequence entropy.
fig = plotly_scatter(lengths_of_domains,num_sequences, color=mean_sequence_entropies)
py.iplot(fig)


Out[82]:

In [66]:
# total number of sequences
sum(num_sequences)


Out[66]:
13694819

In [83]:
# most number of sequences for a single domain
list(longest_domains.items())[np.argmax(num_sequences)]


Out[83]:
('PF00005', ('2VF8', 'B', '513', '762'))

In [84]:
# longest single domain
list(longest_domains.items())[np.argmax(lengths_of_domains)]


Out[84]:
('PF00723', ('1ULV', 'A', '281', '956'))

In [85]:
# Number of sequences available for the longest single domain (PF00723 above).
num_sequences[np.argmax(lengths_of_domains)]


Out[85]:
4318

In [91]:
# Sort the (accession, domain) pairs by their structure lengths (helper
# defined elsewhere; presumably ascending — the tail holds the longest).
sorted_domains, sorted_lengths = sort_list_by_other_list(longest_domains.items(), lengths_of_domains)

In [95]:
# Show the 50 longest domains paired with their lengths.
list(zip(sorted_lengths[-50:], sorted_domains[-50:]))


Out[95]:
[(419, ('PF00310', ('2VDC', 'F', '1', '419'))),
 (420, ('PF05577', ('4EBB', 'B', '38', '465'))),
 (421, ('PF02897', ('2XE4', 'A', '27', '447'))),
 (427, ('PF07517', ('3DIN', 'B', '1', '427'))),
 (429, ('PF00728', ('1QBB', 'A', '338', '766'))),
 (430, ('PF00006', ('1JVA', 'B', '279', '741'))),
 (433, ('PF01474', ('5EX4', 'B', '25', '457'))),
 (433, ('PF02347', ('1WYV', 'G', '1', '433'))),
 (435, ('PF02934', ('1ZQ1', 'D', '15', '449'))),
 (435, ('PF15902', ('3WSZ', 'A', '133', '586'))),
 (440, ('PF01979', ('2FVM', 'D', '53', '502'))),
 (441, ('PF01964', ('4S2A', 'A', '132', '576'))),
 (443, ('PF00152', ('4RMF', 'A', '116', '558'))),
 (451, ('PF01276', ('2VYC', 'J', '140', '590'))),
 (452, ('PF00580', ('4CEJ', 'A', '11', '462'))),
 (454, ('PF00450', ('1AC5', 'A', '15', '468'))),
 (454, ('PF00704', ('4TXG', 'A', '155', '608'))),
 (456, ('PF00145', ('4YOC', 'A', '1139', '1594'))),
 (456, ('PF00148', ('4WN9', 'C', '53', '508'))),
 (459, ('PF00067', ('5JLC', 'A', '57', '524'))),
 (460, ('PF01019', ('2E0W', 'B', '63', '574'))),
 (465, ('PF00232', ('4EK7', 'B', '18', '513'))),
 (468, ('PF01204', ('2WYN', 'D', '55', '533'))),
 (470, ('PF07969', ('5T5M', 'A', '40', '510'))),
 (480, ('PF01266', ('4REP', 'A', '3', '482'))),
 (481, ('PF06202', ('5D0F', 'B', '1037', '1517'))),
 (482, ('PF01532', ('1G6I', 'A', '45', '532'))),
 (485, ('PF00202', ('5DDW', 'D', '20', '509'))),
 (491, ('PF05221', ('5TLS', 'D', '4', '494'))),
 (493, ('PF07748', ('2WYI', 'B', '402', '897'))),
 (496, ('PF00245', ('3WBH', 'B', '3', '498'))),
 (497, ('PF00759', ('1UT9', 'A', '308', '808'))),
 (498, ('PF03200', ('4J5T', 'A', '285', '799'))),
 (498, ('PF04998', ('5M3M', 'A', '989', '1611'))),
 (501, ('PF07971', ('2WZS', 'H', '235', '735'))),
 (504, ('PF02446', ('4S3R', 'A', '148', '667'))),
 (516, ('PF00463', ('5E9H', 'B', '23', '545'))),
 (519, ('PF00135', ('3VKF', 'B', '52', '626'))),
 (519, ('PF00899', ('3GZN', 'C', '20', '540'))),
 (524, ('PF00342', ('3QKI', 'C', '49', '573'))),
 (539, ('PF01593', ('5LBQ', 'A', '288', '826'))),
 (543, ('PF05787', ('4AMF', 'B', '11', '553'))),
 (546, ('PF03098', ('5K1E', 'A', '21', '567'))),
 (548, ('PF00562', ('5IPN', 'C', '717', '1264'))),
 (549, ('PF00374', ('5D51', 'L', '53', '603'))),
 (555, ('PF02738', ('2W55', 'H', '135', '705'))),
 (639, ('PF00063', ('2X51', 'A', '59', '760'))),
 (642, ('PF03028', ('3VKH', 'B', '4007', '4725'))),
 (656, ('PF00305', ('5EEO', 'A', '157', '822'))),
 (676, ('PF00723', ('1ULV', 'A', '281', '956')))]

In [ ]: